#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File    :   charcorrect.py
@Time    :   2020/07/28 11:27:03
@Author  :   陈培杞
@Version :   1.0
@docs    :  
        Sample format of the raw datasets
            * wp_2gram.txt:
                1 the git
                2 in the

            * eml.txt
                Héllo I'm a programmér who crackéd your émail account and dévicé about

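        Process:
            1. Structure the raw data into a dict, bucketed by string length. {2:['aa','bb'], 8:['isthethe','aboutyou']}
            2. Convert the characters to ASCII codes
            3. Convert to the input format of the faiss module and search bucket by bucket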
'''

from pathlib import Path
project_path = Path(__file__).parent.resolve()

import re
import ast
import json
import numpy as np
from collections import defaultdict

class TrieTree(object):
    """Character trie used for exact lookup and wildcard ("fuzzy") search.

    Every node is a plain ``dict`` mapping a single character to its child
    node.  A node at which an inserted word ends additionally holds the
    ``word_end`` sentinel key.
    """

    def __init__(self):
        self.root = {}
        # Integer sentinel marking "a word ends here"; being an int it can
        # never collide with the one-character string keys of child nodes.
        self.word_end = -1

    def add(self, word):
        """Insert ``word`` into the trie, creating nodes as needed."""
        node = self.root
        for char in word:
            node = node.setdefault(char, {})
        node[self.word_end] = True

    def search(self, word):
        """Return True only if exactly ``word`` was previously added."""
        node = self.root
        for char in word:
            if char not in node:
                return False
            node = node[char]
        return self.word_end in node

    def fuzzyMatch(self, tree, word):
        """Expand wildcard characters in ``word`` against the sub-trie ``tree``.

        A character that is a key of the current node is always matched
        literally.  A character that is not a key and whose code point is
        above 122 (``'z'``) acts as a wildcard standing for any single child
        character.  Any other missing character is a mismatch.

        Args:
            tree: Trie node (dict) to start matching from, e.g. ``getTree()``.
            word: Pattern to match; may contain several wildcards.

        Returns:
            A list of strings.  On success each entry has the same length as
            ``word`` and spells a path that exists in ``tree`` (the path is
            not required to end on a word boundary, so prefixes of stored
            words match too).  When nothing matches the result is ``['']``.
        """
        node = tree
        matched = ''
        for i, char in enumerate(word):
            if char in node:
                matched += char
                node = node[char]
                continue

            if ord(char) <= 122:
                # Neither present in the trie nor a wildcard: no match.
                return ['']

            # Wildcard: try every child and match the remainder below it.
            rest = word[i + 1:]
            candidates = []
            for key, child in node.items():
                if key == self.word_end:
                    # The end marker is not a character; skip it instead of
                    # abandoning the sibling branches.
                    continue
                for tail in self.fuzzyMatch(child, rest):
                    # A failed sub-match comes back as '' (too short);
                    # keep only tails that consumed the whole remainder.
                    if len(tail) == len(rest):
                        candidates.append(matched + key + tail)
            # The recursion already handled the rest of the word, so the
            # loop must stop here rather than keep scanning from this node.
            return candidates or ['']

        return [matched]

    def getTree(self):
        """Return the root node of the trie."""
        return self.root






def calculateSimilarity( searchDict:dict, charlength=5):
    """Fuzzy-match every e-mail string of one length bucket against the wiki trie.

    Loads the processed e-mail and wiki data for ``charlength`` through
    ``getProcessedData`` (defined elsewhere; presumably returns a dict keyed
    by string length -- confirm against its definition), builds a
    ``TrieTree`` from the wiki entries under key ``charlength`` and stores,
    for each e-mail entry, the de-duplicated same-length matches.

    Args:
        searchDict: Mapping updated in place: e-mail string -> list of matches.
        charlength: Bucket key and suffix of the processed-data file names.

    Returns:
        None; results are written into ``searchDict``.
    """
    eml_buckets = getProcessedData(f"{EmlSaveFile}_{charlength}")
    wiki_buckets = getProcessedData(f"{WikiSaveFile}_{charlength}")

    # Index the reference (wiki) strings in a trie.
    trie = TrieTree()
    for entry in wiki_buckets[charlength]:
        trie.add(entry)

    for query in eml_buckets[charlength]:
        candidates = trie.fuzzyMatch(trie.getTree(), query)
        # Only candidates as long as the query count; the set drops duplicates.
        same_length = {match for match in candidates if len(match) == len(query)}
        searchDict[query] = list(same_length)



if __name__=='__main__':
    # Script entry point: run the matching for the default bucket length and
    # collect the results in a module-level dict.
    searchDict = {}
    calculateSimilarity(searchDict)
